rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  625036 33.4    1434327 76.7   702048 37.5
## Vcells 1170523  9.0    8388608 64.0  1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)


`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )
group = "others"

Note: be careful with 35.2 bin matches

params_list <- list(
  plantName = 'stu'
  , plantNameOut = "potato"
  , plantDirOut = file.path('..', 'reports', group, "potato")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001da9e61aca8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%

cat(child_content)

3 Subsection: stu

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

3.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   15035   75961   17547   51397   23243   33885
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID            to_plant source 
##    <chr>       <chr>      <chr>                <chr>    <chr>  
##  1 AT2G37600   ath        PGSC0003DMG400000168 stu      FastOMA
##  2 AT3G53740   ath        PGSC0003DMG400000168 stu      FastOMA
##  3 AT3G23880   ath        Sotub12g029980       stu      FastOMA
##  4 AT2G13770   ath        Sotub12g032820       stu      FastOMA
##  5 AT1G01220   ath        Soltu.DM.01G035280   stu      MCScanX
##  6 AT1G01225   ath        Soltu.DM.01G035270   stu      MCScanX
##  7 ATMG01080   ath        PGSC0003DMG400002202 stu      MCScanX
##  8 ATMG01110   ath        Soltu.DM.12G017350   stu      MCScanX
##  9 AT1G56560   ath        Soltu.DM.01G018690   stu      OrthoDB
## 10 AT3G06500   ath        Soltu.DM.01G018690   stu      OrthoDB
## 11 AT5G47320   ath        Sotub01g015810       stu      OrthoDB
## 12 AT2G28815   ath        Soltu.DM.01G010300   stu      OrthoDB
## 13 AT1G61070   ath        Soltu.DM.01G000020   stu      PLAZA  
## 14 AT2G02100   ath        Soltu.DM.01G000020   stu      PLAZA  
## 15 AT4G28140   ath        PGSC0003DMG400041562 stu      PLAZA  
## 16 AT3G15353   ath        PGSC0003DMG400015318 stu      PLAZA  
## 17 AT1G01030   ath        Soltu.DM.08G005020   stu      RBH    
## 18 AT1G01030   ath        Soltu.DM.08G005030   stu      RBH    
## 19 ATMG01360   ath        Sotub03g026800       stu      RBH    
## 20 ATMG01360   ath        Sotub07g016160       stu      RBH    
## 21 AT5G01020   ath        Soltu.DM.10G018260   stu      compara
## 22 AT5G01030   ath        Soltu.DM.09G001170   stu      compara
## 23 AT1G80950   ath        Soltu.DM.04G034970   stu      compara
## 24 AT1G80980   ath        Soltu.DM.03G031250   stu      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2241282 119.7    4353742 232.6  3000752 160.3
## Vcells 19789277 151.0   32425744 247.4 32213056 245.8

3.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23082
length(unique(dt$to_geneID))
## [1] 30489
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   15035   75961   17547   51397   23243   33885
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

3.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
##   Please report the issue at
##   <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

3.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

3.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

3.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
##  FALSE   TRUE 
## 115350    428
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "PGSC0003DMG400002971" "PGSC0003DMG401011339" "PGSC0003DMG401011339"
##   [4] "Soltu.DM.S000060"     "Soltu.DM.S000060"     "Soltu.DM.S000060"    
##   [7] "Soltu.DM.S000060"     "Soltu.DM.S000060"     "Soltu.DM.S000060"    
##  [10] "Soltu.DM.S000060"     "Soltu.DM.S000070"     "Soltu.DM.S000070"    
##  [13] "Soltu.DM.S000100"     "Soltu.DM.S000110"     "Soltu.DM.S000120"    
##  [16] "Soltu.DM.S000120"     "Soltu.DM.S000140"     "Soltu.DM.S000140"    
##  [19] "Soltu.DM.S000160"     "Soltu.DM.S000160"     "Soltu.DM.S000170"    
##  [22] "Soltu.DM.S000170"     "Soltu.DM.S000180"     "Soltu.DM.S000210"    
##  [25] "Soltu.DM.S000210"     "Soltu.DM.S000240"     "Soltu.DM.S000240"    
##  [28] "Soltu.DM.S000240"     "Soltu.DM.S000270"     "Soltu.DM.S000280"    
##  [31] "Soltu.DM.S000290"     "Soltu.DM.S000290"     "Soltu.DM.S000320"    
##  [34] "Soltu.DM.S000330"     "Soltu.DM.S000360"     "Soltu.DM.S000360"    
##  [37] "Soltu.DM.S000420"     "Soltu.DM.S000420"     "Soltu.DM.S000430"    
##  [40] "Soltu.DM.S000430"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [43] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [46] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [49] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000510"    
##  [52] "Soltu.DM.S000510"     "Soltu.DM.S000510"     "Soltu.DM.S000520"    
##  [55] "Soltu.DM.S000520"     "Soltu.DM.S000780"     "Soltu.DM.S000780"    
##  [58] "Soltu.DM.S000830"     "Soltu.DM.S000840"     "Soltu.DM.S000850"    
##  [61] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [64] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [67] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000850"    
##  [70] "Soltu.DM.S000850"     "Soltu.DM.S000850"     "Soltu.DM.S000870"    
##  [73] "Soltu.DM.S000890"     "Soltu.DM.S000920"     "Soltu.DM.S000950"    
##  [76] "Soltu.DM.S000980"     "Soltu.DM.S000990"     "Soltu.DM.S000990"    
##  [79] "Soltu.DM.S001000"     "Soltu.DM.S001000"     "Soltu.DM.S001000"    
##  [82] "Soltu.DM.S001000"     "Soltu.DM.S001100"     "Soltu.DM.S001120"    
##  [85] "Soltu.DM.S001120"     "Soltu.DM.S001120"     "Soltu.DM.S001120"    
##  [88] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [91] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [94] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
##  [97] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
## [100] "Soltu.DM.S001130"     "Soltu.DM.S001130"     "Soltu.DM.S001130"    
## [103] "Soltu.DM.S001270"     "Soltu.DM.S001270"     "Soltu.DM.S001270"    
## [106] "Soltu.DM.S001280"     "Soltu.DM.S001300"     "Soltu.DM.S001340"    
## [109] "Soltu.DM.S001340"     "Soltu.DM.S001350"     "Soltu.DM.S001470"    
## [112] "Soltu.DM.S001470"     "Soltu.DM.S001520"     "Soltu.DM.S001520"    
## [115] "Soltu.DM.S001520"     "Soltu.DM.S001520"     "Soltu.DM.S001540"    
## [118] "Soltu.DM.S001540"     "Soltu.DM.S001600"     "Soltu.DM.S001650"    
## [121] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [124] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [127] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [130] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [133] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [136] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [139] "Soltu.DM.S001650"     "Soltu.DM.S001650"     "Soltu.DM.S001650"    
## [142] "Soltu.DM.S001650"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [145] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [148] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [151] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [154] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [157] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [160] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [163] "Soltu.DM.S001660"     "Soltu.DM.S001660"     "Soltu.DM.S001660"    
## [166] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [169] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [172] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [175] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [178] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [181] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [184] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001670"    
## [187] "Soltu.DM.S001670"     "Soltu.DM.S001670"     "Soltu.DM.S001680"    
## [190] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [193] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [196] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [199] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [202] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [205] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [208] "Soltu.DM.S001680"     "Soltu.DM.S001680"     "Soltu.DM.S001680"    
## [211] "Soltu.DM.S001680"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [214] "Soltu.DM.S001700"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [217] "Soltu.DM.S001700"     "Soltu.DM.S001700"     "Soltu.DM.S001700"    
## [220] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [223] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [226] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [229] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001740"    
## [232] "Soltu.DM.S001740"     "Soltu.DM.S001740"     "Soltu.DM.S001750"    
## [235] "Soltu.DM.S001770"     "Soltu.DM.S001810"     "Soltu.DM.S001840"    
## [238] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [241] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [244] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [247] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [250] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [253] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [256] "Soltu.DM.S001840"     "Soltu.DM.S001840"     "Soltu.DM.S001840"    
## [259] "Soltu.DM.S001840"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [262] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [265] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [268] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [271] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [274] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [277] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [280] "Soltu.DM.S001850"     "Soltu.DM.S001850"     "Soltu.DM.S001850"    
## [283] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [286] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [289] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [292] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [295] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [298] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [301] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001860"    
## [304] "Soltu.DM.S001860"     "Soltu.DM.S001860"     "Soltu.DM.S001870"    
## [307] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [310] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [313] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [316] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [319] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [322] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [325] "Soltu.DM.S001870"     "Soltu.DM.S001870"     "Soltu.DM.S001870"    
## [328] "Soltu.DM.S001870"     "Soltu.DM.S001970"     "Soltu.DM.S001970"    
## [331] "Soltu.DM.S001970"     "Soltu.DM.S001970"     "Soltu.DM.S002030"    
## [334] "Soltu.DM.S002030"     "Soltu.DM.S002050"     "Soltu.DM.S002050"    
## [337] "Soltu.DM.S002090"     "Soltu.DM.S002100"     "Soltu.DM.S002100"    
## [340] "Soltu.DM.S002100"     "Soltu.DM.S002140"     "Soltu.DM.S002160"    
## [343] "Soltu.DM.S002160"     "Soltu.DM.S002160"     "Soltu.DM.S002160"    
## [346] "Soltu.DM.S002180"     "Soltu.DM.S002180"     "Soltu.DM.S002200"    
## [349] "Soltu.DM.S002200"     "Soltu.DM.S002200"     "Soltu.DM.S002200"    
## [352] "Soltu.DM.S002210"     "Soltu.DM.S002210"     "Soltu.DM.S002210"    
## [355] "Soltu.DM.S002210"     "Soltu.DM.S002220"     "Soltu.DM.S002220"    
## [358] "Soltu.DM.S002220"     "Soltu.DM.S002220"     "Soltu.DM.S002230"    
## [361] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [364] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [367] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [370] "Soltu.DM.S002230"     "Soltu.DM.S002230"     "Soltu.DM.S002230"    
## [373] "Soltu.DM.S002230"     "Soltu.DM.S002260"     "Soltu.DM.S002260"    
## [376] "Soltu.DM.S002260"     "Soltu.DM.S002260"     "Soltu.DM.S002260"    
## [379] "Soltu.DM.S002260"     "Soltu.DM.S002330"     "Soltu.DM.S002330"    
## [382] "Soltu.DM.S002340"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [385] "Soltu.DM.S002350"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [388] "Soltu.DM.S002350"     "Soltu.DM.S002350"     "Soltu.DM.S002350"    
## [391] "Soltu.DM.S002350"     "Soltu.DM.S002420"     "Soltu.DM.S002420"    
## [394] "Soltu.DM.S002440"     "Soltu.DM.S002460"     "Soltu.DM.S002490"    
## [397] "Soltu.DM.S002490"     "Soltu.DM.S002490"     "Soltu.DM.S002490"    
## [400] "Soltu.DM.S002490"     "Soltu.DM.S002490"     "Soltu.DM.S002490"    
## [403] "Soltu.DM.S002510"     "Soltu.DM.S002640"     "Soltu.DM.S002640"    
## [406] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [409] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [412] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [415] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [418] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [421] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [424] "Soltu.DM.S002650"     "Soltu.DM.S002650"     "Soltu.DM.S002650"    
## [427] "Soltu.DM.S002650"     "Soltu.DM.S002650"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   7864733 420.1   14129500  754.6  14129500  754.6
## Vcells 120824189 921.9  189394518 1445.0 189394518 1445.0

3.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

3.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23082
length(unique(df$to_geneID))
## [1] 30489
range(df$from_count)
## [1]  1 88
range(df$to_count)
## [1]  1 59
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 473
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 310
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          88473          17365           9940
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       48052    15276           7945
##   2       14398     1554           1200
##   3        9422      355            402
##   4        7457      123            230
##   5        6299       51            120
##   6        2845        6             43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            9429            3057            7268
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                6740                7268                3430                1539 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               17547                1952                9429                1244 
##               PLAZA         RBH_MapMan4              reject 
##                6084                3057               57488
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

3.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          52144           2842           3304
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       21079     1176           1934
##   2        8892     1202            726
##   3        6779      295            271
##   4        6513      113            214
##   5        6036       50            116
##   6        2845        6             43
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20863
length(unique(kept$to_geneID))
## [1] 28742
range(kept$from_count)
## [1]  1 61
range(kept$to_count)
## [1]  1 45
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 111
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 38
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

3.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 204                 201                 122                  69 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 859                  88                 222                  42 
##               PLAZA         RBH_MapMan4              reject 
##                 311                 104                2371
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'sly'
  , plantNameOut = "tomato"
  , plantDirOut = file.path('..', 'reports', group, "tomato")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001dae5e735f8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%

cat(child_content)

4 Subsection: sly

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   14733   54079   17647   42001   22210   26629
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT1G06160   ath        PRAM_106208    sly      FastOMA
##  2 AT2G31230   ath        PRAM_106208    sly      FastOMA
##  3 AT3G63390   ath        Solyc12T002847 sly      FastOMA
##  4 AT1G55320   ath        Solyc12T002849 sly      FastOMA
##  5 AT1G01180   ath        Solyc01T003002 sly      MCScanX
##  6 AT1G01190   ath        Solyc01T002993 sly      MCScanX
##  7 ATMG01110   ath        Solyc11T001808 sly      MCScanX
##  8 ATMG01330   ath        Solyc11T001788 sly      MCScanX
##  9 AT2G19940   ath        Solyc01T004061 sly      OrthoDB
## 10 AT3G04650   ath        Solyc01T003078 sly      OrthoDB
## 11 AT2G07633   ath        Solyc11T001073 sly      OrthoDB
## 12 AT2G07633   ath        Solyc11T001072 sly      OrthoDB
## 13 AT5G07610   ath        Solyc05T002277 sly      PLAZA  
## 14 AT4G04330   ath        Solyc02T001764 sly      PLAZA  
## 15 AT3G18160   ath        Solyc12T001775 sly      PLAZA  
## 16 AT3G01510   ath        Solyc12T001230 sly      PLAZA  
## 17 AT1G01030   ath        Solyc04T000117 sly      RBH    
## 18 AT1G01030   ath        Solyc08T000375 sly      RBH    
## 19 ATMG01360   ath        Solyc11T001913 sly      RBH    
## 20 ATMG01410   ath        Solyc11T001822 sly      RBH    
## 21 AT3G20015   ath        Solyc04T000286 sly      compara
## 22 AT5G66990   ath        Solyc02T000133 sly      compara
## 23 AT1G69770   ath        Solyc12T002848 sly      compara
## 24 AT1G55350   ath        Solyc12T002850 sly      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   5900150 315.2   11303600  603.7  14129500  754.6
## Vcells 125408013 956.8  189394518 1445.0 189394518 1445.0

4.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22707
length(unique(dt$to_geneID))
## [1] 23599
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   14733   54079   17647   42001   22210   26629
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

4.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE  TRUE 
## 83343   980
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "PRAM_106208"      "PRAM_106208"      "PRAM_106208"     
##   [4] "PRAM_106208.1"    "PRAM_106208.1"    "PRAM_106208.1"   
##   [7] "PRAM_106208.1"    "PRAM_106208.1"    "PRAM_106382.1"   
##  [10] "PRAM_116774"      "PRAM_116774"      "PRAM_121333"     
##  [13] "PRAM_121333"      "PRAM_121333"      "PRAM_121333"     
##  [16] "PRAM_12783"       "PRAM_12783"       "PRAM_12783"      
##  [19] "PRAM_12783"       "PRAM_128172"      "PRAM_128172"     
##  [22] "PRAM_128172"      "PRAM_128172"      "PRAM_128172"     
##  [25] "PRAM_139928.1"    "PRAM_139928.1"    "PRAM_139928.1.p1"
##  [28] "PRAM_139928.1.p1" "PRAM_140960.1.p1" "PRAM_141672.1.p1"
##  [31] "PRAM_141691"      "PRAM_141691"      "PRAM_141691"     
##  [34] "PRAM_141691"      "PRAM_141691"      "PRAM_141691"     
##  [37] "PRAM_141691.1"    "PRAM_141691.1"    "PRAM_141691.1"   
##  [40] "PRAM_141691.1"    "PRAM_141691.1"    "PRAM_141691.1.p1"
##  [43] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
##  [46] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
##  [49] "PRAM_141869"      "PRAM_141869"      "PRAM_141869.1"   
##  [52] "PRAM_141869.1"    "PRAM_141870"      "PRAM_141870"     
##  [55] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [58] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [61] "PRAM_141870"      "PRAM_141870"      "PRAM_141870"     
##  [64] "PRAM_141870.1"    "PRAM_141870.1"    "PRAM_141870.1"   
##  [67] "PRAM_141870.1"    "PRAM_141870.1"    "PRAM_141870.1.p1"
##  [70] "PRAM_141870.1.p1" "PRAM_141870.1.p1" "PRAM_141870.1.p1"
##  [73] "PRAM_141870.1.p1" "PRAM_141872"      "PRAM_141872.1"   
##  [76] "PRAM_141872.1.p1" "PRAM_141875.1"    "PRAM_141875.1"   
##  [79] "PRAM_141875.1"    "PRAM_141875.1"    "PRAM_141875.1"   
##  [82] "PRAM_141875.1"    "PRAM_141875.1"    "PRAM_141876.1"   
##  [85] "PRAM_141876.1"    "PRAM_141877.1"    "PRAM_141877.1.p2"
##  [88] "PRAM_15610"       "PRAM_15610.1"     "PRAM_15610.1.p1" 
##  [91] "PRAM_15610.1.p1"  "PRAM_15612.1"     "PRAM_15612.1.p1" 
##  [94] "PRAM_15614"       "PRAM_15614.1.p1"  "PRAM_15617"      
##  [97] "PRAM_15617.1"     "PRAM_15617.1.p1"  "PRAM_15671"      
## [100] "PRAM_15671.1"     "PRAM_15671.1.p3"  "PRAM_15687.1.p1" 
## [103] "PRAM_15687.1.p1"  "PRAM_15687.1.p1"  "PRAM_15708.1.p1" 
## [106] "PRAM_15708.1.p1"  "PRAM_15708.1.p1"  "PRAM_15758"      
## [109] "PRAM_15758.1"     "PRAM_15758.1"     "PRAM_15758.1"    
## [112] "PRAM_15758.1"     "PRAM_15758.1"     "PRAM_15758.1.p1" 
## [115] "PRAM_15758.1.p1"  "PRAM_15763"       "PRAM_15763.1"    
## [118] "PRAM_15773"       "PRAM_15773"       "PRAM_15773.1"    
## [121] "PRAM_15773.1.p1"  "PRAM_15781.1"     "PRAM_15781.1"    
## [124] "PRAM_15781.1"     "PRAM_15781.1"     "PRAM_15781.1"    
## [127] "PRAM_15781.1"     "PRAM_15781.1.p1"  "PRAM_15821.1"    
## [130] "PRAM_15848"       "PRAM_15848"       "PRAM_15848"      
## [133] "PRAM_15848"       "PRAM_15848.1"     "PRAM_15848.1"    
## [136] "PRAM_15848.1"     "PRAM_15848.1"     "PRAM_15848.1"    
## [139] "PRAM_15848.1"     "PRAM_15848.1"     "PRAM_15848.1"    
## [142] "PRAM_15848.1.p1"  "PRAM_15848.1.p1"  "PRAM_15856"      
## [145] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [148] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [151] "PRAM_15856"       "PRAM_15856"       "PRAM_15856"      
## [154] "PRAM_15856"       "PRAM_15856"       "PRAM_15856.1"    
## [157] "PRAM_15856.1"     "PRAM_15856.1"     "PRAM_15856.1"    
## [160] "PRAM_15856.1"     "PRAM_15856.1"     "PRAM_15856.1.p1" 
## [163] "PRAM_16203"       "PRAM_16203"       "PRAM_16203"      
## [166] "PRAM_16203"       "PRAM_21846"       "PRAM_21846.1"    
## [169] "PRAM_21846.1"     "PRAM_21846.1"     "PRAM_21846.1"    
## [172] "PRAM_21846.1.p1"  "PRAM_21846.1.p1"  "PRAM_21846.1.p1" 
## [175] "PRAM_21846.1.p1"  "PRAM_21846.1.p1"  "PRAM_21846.1.p1" 
## [178] "PRAM_21934.1"     "PRAM_21934.1"     "PRAM_21934.1.p1" 
## [181] "PRAM_21934.1.p1"  "PRAM_22800"       "PRAM_22800"      
## [184] "PRAM_22800"       "PRAM_22800"       "PRAM_22800"      
## [187] "PRAM_22800"       "PRAM_22800"       "PRAM_22800.1"    
## [190] "PRAM_22800.1"     "PRAM_22800.1.p1"  "PRAM_22800.1.p1" 
## [193] "PRAM_23091"       "PRAM_23722"       "PRAM_23722"      
## [196] "PRAM_23722"       "PRAM_23722"       "PRAM_23722"      
## [199] "PRAM_23722"       "PRAM_23722"       "PRAM_23722"      
## [202] "PRAM_23722"       "PRAM_23722.1"     "PRAM_23722.1"    
## [205] "PRAM_23722.1"     "PRAM_23722.1"     "PRAM_23722.1.p1" 
## [208] "PRAM_23722.1.p1"  "PRAM_23722.1.p1"  "PRAM_23722.1.p1" 
## [211] "PRAM_24043.1"     "PRAM_24043.1"     "PRAM_24043.1"    
## [214] "PRAM_24077"       "PRAM_24077"       "PRAM_24077"      
## [217] "PRAM_24077.1"     "PRAM_24077.1"     "PRAM_24077.1"    
## [220] "PRAM_24077.1.p1"  "PRAM_24077.1.p1"  "PRAM_24077.1.p1" 
## [223] "PRAM_24440.1.p1"  "PRAM_24440.1.p1"  "PRAM_24440.1.p1" 
## [226] "PRAM_24440.1.p1"  "PRAM_24440.1.p1"  "PRAM_25343"      
## [229] "PRAM_25343.1"     "PRAM_25343.1"     "PRAM_25343.1.p1" 
## [232] "PRAM_25343.1.p1"  "PRAM_25466"       "PRAM_25466"      
## [235] "PRAM_25466"       "PRAM_25466"       "PRAM_25466.1"    
## [238] "PRAM_25466.1"     "PRAM_25466.1.p1"  "PRAM_25466.1.p1" 
## [241] "PRAM_25482"       "PRAM_25482"       "PRAM_25482"      
## [244] "PRAM_25482"       "PRAM_25482"       "PRAM_25482"      
## [247] "PRAM_25482"       "PRAM_25601"       "PRAM_25601"      
## [250] "PRAM_25601"       "PRAM_25601"       "PRAM_25601"      
## [253] "PRAM_25601"       "PRAM_25601"       "PRAM_25601"      
## [256] "PRAM_25601.1"     "PRAM_25601.1"     "PRAM_25601.1.p1" 
## [259] "PRAM_25757"       "PRAM_25757.1"     "PRAM_25757.1.p1" 
## [262] "PRAM_25806.1"     "PRAM_25806.1"     "PRAM_25806.1"    
## [265] "PRAM_25806.1.p1"  "PRAM_25806.1.p1"  "PRAM_25806.1.p1" 
## [268] "PRAM_25806.1.p1"  "PRAM_25806.1.p1"  "PRAM_25847"      
## [271] "PRAM_25847.1"     "PRAM_25847.1.p1"  "PRAM_25853.1"    
## [274] "PRAM_25853.1"     "PRAM_25853.1"     "PRAM_25853.1.p1" 
## [277] "PRAM_25853.1.p1"  "PRAM_25853.1.p1"  "PRAM_25853.1.p1" 
## [280] "PRAM_25855.1"     "PRAM_25855.1.p1"  "PRAM_25858"      
## [283] "PRAM_25858"       "PRAM_25858.1"     "PRAM_25858.1"    
## [286] "PRAM_25858.1.p1"  "PRAM_25858.1.p1"  "PRAM_25864"      
## [289] "PRAM_25864"       "PRAM_25864.1"     "PRAM_25864.1"    
## [292] "PRAM_25864.1.p1"  "PRAM_25864.1.p1"  "PRAM_25866"      
## [295] "PRAM_25866"       "PRAM_25866.1"     "PRAM_25866.1"    
## [298] "PRAM_25866.1.p1"  "PRAM_25866.1.p1"  "PRAM_25885"      
## [301] "PRAM_25885"       "PRAM_25885.1"     "PRAM_25885.1"    
## [304] "PRAM_25885.1.p1"  "PRAM_25885.1.p1"  "PRAM_25896.1"    
## [307] "PRAM_25896.1"     "PRAM_25896.1"     "PRAM_25896.1"    
## [310] "PRAM_25896.1"     "PRAM_25896.1"     "PRAM_25900"      
## [313] "PRAM_25900.1"     "PRAM_25900.1.p1"  "PRAM_25902"      
## [316] "PRAM_25902.1"     "PRAM_25902.1.p1"  "PRAM_25906.1"    
## [319] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [322] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [325] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [328] "PRAM_25906.1"     "PRAM_25906.1"     "PRAM_25906.1"    
## [331] "PRAM_25911"       "PRAM_25911.1"     "PRAM_25911.1"    
## [334] "PRAM_25911.1.p1"  "PRAM_25921"       "PRAM_25921"      
## [337] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [340] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [343] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [346] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [349] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [352] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [355] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [358] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [361] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [364] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [367] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [370] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [373] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [376] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [379] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [382] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [385] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [388] "PRAM_25921"       "PRAM_25921"       "PRAM_25921"      
## [391] "PRAM_25921"       "PRAM_25921"       "PRAM_25921.1"    
## [394] "PRAM_25921.1"     "PRAM_25921.1.p1"  "PRAM_25921.1.p1" 
## [397] "PRAM_25923.1"     "PRAM_25923.1"     "PRAM_25923.1.p1" 
## [400] "PRAM_25923.1.p1"  "PRAM_25927"       "PRAM_25927.1"    
## [403] "PRAM_25927.1.p1"  "PRAM_25933"       "PRAM_25933.1"    
## [406] "PRAM_25933.1.p1"  "PRAM_25935"       "PRAM_25935.1"    
## [409] "PRAM_25935.1.p1"  "PRAM_25937"       "PRAM_25937"      
## [412] "PRAM_25937"       "PRAM_25937"       "PRAM_25937"      
## [415] "PRAM_25937"       "PRAM_25937"       "PRAM_25937"      
## [418] "PRAM_25937.1.p1"  "PRAM_25937.1.p1"  "PRAM_25937.1.p1" 
## [421] "PRAM_25937.1.p1"  "PRAM_25939"       "PRAM_25939.1"    
## [424] "PRAM_25939.1.p1"  "PRAM_25945"       "PRAM_25945.1"    
## [427] "PRAM_25945.1.p1"  "PRAM_25951"       "PRAM_25951"      
## [430] "PRAM_25951"       "PRAM_25951.1"     "PRAM_25951.1"    
## [433] "PRAM_25951.1"     "PRAM_25951.1.p1"  "PRAM_25951.1.p1" 
## [436] "PRAM_25951.1.p1"  "PRAM_25994"       "PRAM_25994.1"    
## [439] "PRAM_25994.1.p1"  "PRAM_25996"       "PRAM_25996"      
## [442] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [445] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [448] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [451] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [454] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [457] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [460] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [463] "PRAM_25996"       "PRAM_25996"       "PRAM_25996"      
## [466] "PRAM_25996"       "PRAM_25996.1"     "PRAM_25996.1"    
## [469] "PRAM_25996.1"     "PRAM_25996.1"     "PRAM_25996.1"    
## [472] "PRAM_25996.1"     "PRAM_25996.1"     "PRAM_25996.1"    
## [475] "PRAM_26005.1"     "PRAM_26005.1"     "PRAM_26007"      
## [478] "PRAM_26007.1"     "PRAM_26007.1.p1"  "PRAM_26016"      
## [481] "PRAM_26016.1"     "PRAM_26016.1.p1"  "PRAM_26019"      
## [484] "PRAM_26019"       "PRAM_26019"       "PRAM_26019"      
## [487] "PRAM_26019"       "PRAM_26019.1"     "PRAM_26019.1.p1" 
## [490] "PRAM_26020"       "PRAM_26020.1"     "PRAM_26020.1.p1" 
## [493] "PRAM_26068"       "PRAM_26068.1"     "PRAM_26068.1.p1" 
## [496] "PRAM_26082.1"     "PRAM_26082.1"     "PRAM_26082.1.p1" 
## [499] "PRAM_26082.1.p1"  "PRAM_26097"       "PRAM_26097"      
## [502] "PRAM_26097.1"     "PRAM_26097.1.p1"  "PRAM_26097.1.p1" 
## [505] "PRAM_26101.1"     "PRAM_26105"       "PRAM_26105.1"    
## [508] "PRAM_26105.1.p1"  "PRAM_26108"       "PRAM_26108"      
## [511] "PRAM_26108"       "PRAM_26108.1"     "PRAM_26108.1.p1" 
## [514] "PRAM_26108.1.p1"  "PRAM_26108.1.p1"  "PRAM_26108.1.p1" 
## [517] "PRAM_26109"       "PRAM_26109.1"     "PRAM_26109.1"    
## [520] "PRAM_26109.1.p1"  "PRAM_26109.1.p1"  "PRAM_26109.1.p1" 
## [523] "PRAM_26113"       "PRAM_26113"       "PRAM_26113"      
## [526] "PRAM_26113"       "PRAM_26113"       "PRAM_26113"      
## [529] "PRAM_26113"       "PRAM_26113"       "PRAM_26113.1"    
## [532] "PRAM_26113.1"     "PRAM_26113.1"     "PRAM_26113.1"    
## [535] "PRAM_26113.1"     "PRAM_26128"       "PRAM_26128.1"    
## [538] "PRAM_26128.1.p1"  "PRAM_26129"       "PRAM_26129.1"    
## [541] "PRAM_26129.1"     "PRAM_26129.1.p1"  "PRAM_26132"      
## [544] "PRAM_26132"       "PRAM_26132.1"     "PRAM_26132.1"    
## [547] "PRAM_26132.1.p1"  "PRAM_26132.1.p1"  "PRAM_26139"      
## [550] "PRAM_26139"       "PRAM_26139"       "PRAM_26139.1"    
## [553] "PRAM_26139.1"     "PRAM_26139.1"     "PRAM_26139.1"    
## [556] "PRAM_26139.1.p1"  "PRAM_26139.1.p1"  "PRAM_26139.1.p1" 
## [559] "PRAM_26139.1.p1"  "PRAM_26139.1.p1"  "PRAM_26145"      
## [562] "PRAM_26145"       "PRAM_26145.1"     "PRAM_26145.1"    
## [565] "PRAM_26145.1.p1"  "PRAM_26184"       "PRAM_26184.1"    
## [568] "PRAM_26184.1.p1"  "PRAM_26189"       "PRAM_26189"      
## [571] "PRAM_26189"       "PRAM_26189"       "PRAM_26189.1"    
## [574] "PRAM_26189.1"     "PRAM_26189.1"     "PRAM_26189.1"    
## [577] "PRAM_26189.1"     "PRAM_26189.1.p1"  "PRAM_26195.1"    
## [580] "PRAM_26195.1"     "PRAM_26195.1.p1"  "PRAM_26195.1.p1" 
## [583] "PRAM_26207"       "PRAM_26207"       "PRAM_26207.1"    
## [586] "PRAM_26207.1"     "PRAM_26207.1"     "PRAM_26207.1"    
## [589] "PRAM_26207.1.p1"  "PRAM_26211"       "PRAM_26211.1"    
## [592] "PRAM_26211.1.p1"  "PRAM_26212"       "PRAM_26212.1"    
## [595] "PRAM_26212.1.p1"  "PRAM_26216"       "PRAM_26216"      
## [598] "PRAM_26216"       "PRAM_26216"       "PRAM_26216.1"    
## [601] "PRAM_26216.1"     "PRAM_26216.1"     "PRAM_26216.1"    
## [604] "PRAM_26216.1.p1"  "PRAM_26216.1.p1"  "PRAM_26216.1.p1" 
## [607] "PRAM_26216.1.p1"  "PRAM_26218"       "PRAM_26218.1"    
## [610] "PRAM_26218.1.p1"  "PRAM_26220"       "PRAM_26220.1"    
## [613] "PRAM_26220.1.p1"  "PRAM_26242"       "PRAM_26242"      
## [616] "PRAM_26242.1"     "PRAM_26242.1"     "PRAM_26242.1"    
## [619] "PRAM_26242.1.p1"  "PRAM_26242.1.p1"  "PRAM_26243.1"    
## [622] "PRAM_26243.1.p1"  "PRAM_26245"       "PRAM_26245"      
## [625] "PRAM_26245"       "PRAM_26245"       "PRAM_26245.1"    
## [628] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [631] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [634] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [637] "PRAM_26245.1"     "PRAM_26245.1"     "PRAM_26245.1"    
## [640] "PRAM_26245.1"     "PRAM_26247"       "PRAM_26247.1"    
## [643] "PRAM_26247.1.p1"  "PRAM_26248"       "PRAM_26248"      
## [646] "PRAM_26248"       "PRAM_26248"       "PRAM_26248.1"    
## [649] "PRAM_26248.1"     "PRAM_26248.1"     "PRAM_26248.1"    
## [652] "PRAM_26248.1.p1"  "PRAM_26248.1.p1"  "PRAM_26248.1.p1" 
## [655] "PRAM_26248.1.p1"  "PRAM_26252"       "PRAM_26252.1"    
## [658] "PRAM_26252.1.p1"  "PRAM_26255"       "PRAM_26255.1"    
## [661] "PRAM_26255.1"     "PRAM_26255.1"     "PRAM_26255.1"    
## [664] "PRAM_26255.1"     "PRAM_26255.1.p1"  "PRAM_26257"      
## [667] "PRAM_26257.1"     "PRAM_26257.1"     "PRAM_26257.1"    
## [670] "PRAM_26257.1.p1"  "PRAM_26262"       "PRAM_26262"      
## [673] "PRAM_26262.1"     "PRAM_26262.1"     "PRAM_26262.1"    
## [676] "PRAM_26262.1.p1"  "PRAM_26262.1.p1"  "PRAM_26263.1"    
## [679] "PRAM_26263.1"     "PRAM_26263.1"     "PRAM_26263.1.p1" 
## [682] "PRAM_26263.1.p1"  "PRAM_26268.1"     "PRAM_26268.1"    
## [685] "PRAM_26268.1"     "PRAM_26273"       "PRAM_26273"      
## [688] "PRAM_26273"       "PRAM_26273"       "PRAM_26273.1"    
## [691] "PRAM_26273.1"     "PRAM_26273.1"     "PRAM_26273.1"    
## [694] "PRAM_26276"       "PRAM_26276.1"     "PRAM_26276.1.p1" 
## [697] "PRAM_26279"       "PRAM_26279"       "PRAM_26279.1"    
## [700] "PRAM_26279.1"     "PRAM_26279.1"     "PRAM_26279.1.p1" 
## [703] "PRAM_26279.1.p1"  "PRAM_26279.1.p1"  "PRAM_26280"      
## [706] "PRAM_26280"       "PRAM_26280.1"     "PRAM_26280.1"    
## [709] "PRAM_26280.1.p1"  "PRAM_26280.1.p1"  "PRAM_26281"      
## [712] "PRAM_26281"       "PRAM_26281.1"     "PRAM_26281.1"    
## [715] "PRAM_26281.1"     "PRAM_26281.1"     "PRAM_26281.1"    
## [718] "PRAM_26281.1.p1"  "PRAM_26281.1.p1"  "PRAM_26296"      
## [721] "PRAM_26296.1"     "PRAM_26296.1.p2"  "PRAM_26298"      
## [724] "PRAM_26298.1"     "PRAM_26301"       "PRAM_26301.1"    
## [727] "PRAM_26301.1.p1"  "PRAM_26302"       "PRAM_26302.1"    
## [730] "PRAM_26302.1.p1"  "PRAM_26307"       "PRAM_26307"      
## [733] "PRAM_26307"       "PRAM_26307.1"     "PRAM_26307.1"    
## [736] "PRAM_26307.1.p1"  "PRAM_26312"       "PRAM_26312.1"    
## [739] "PRAM_26312.1"     "PRAM_26312.1.p1"  "PRAM_26314"      
## [742] "PRAM_26314"       "PRAM_26314.1"     "PRAM_26314.1"    
## [745] "PRAM_26314.1.p1"  "PRAM_26314.1.p1"  "PRAM_26322"      
## [748] "PRAM_26322.1"     "PRAM_26322.1.p1"  "PRAM_26322.1.p1" 
## [751] "PRAM_26322.1.p1"  "PRAM_26326"       "PRAM_26326.1"    
## [754] "PRAM_26326.1.p1"  "PRAM_26326.1.p1"  "PRAM_26326.1.p1" 
## [757] "PRAM_26328"       "PRAM_26328"       "PRAM_26328.1"    
## [760] "PRAM_26328.1"     "PRAM_26331"       "PRAM_26331.1"    
## [763] "PRAM_26331.1"     "PRAM_26334"       "PRAM_26334.1"    
## [766] "PRAM_26334.1"     "PRAM_26363"       "PRAM_26363.1"    
## [769] "PRAM_26363.1.p3"  "PRAM_267"         "PRAM_267"        
## [772] "PRAM_267.1"       "PRAM_267.1"       "PRAM_267.1"      
## [775] "PRAM_267.1"       "PRAM_267.1.p1"    "PRAM_267.1.p1"   
## [778] "PRAM_272"         "PRAM_272.1"       "PRAM_272.1.p1"   
## [781] "PRAM_274.1"       "PRAM_281"         "PRAM_281.1"      
## [784] "PRAM_283"         "PRAM_283.1"       "PRAM_36897"      
## [787] "PRAM_36897.1"     "PRAM_36897.1.p1"  "PRAM_36916"      
## [790] "PRAM_36916"       "PRAM_36916.1"     "PRAM_36916.1"    
## [793] "PRAM_36916.1.p1"  "PRAM_36916.1.p1"  "PRAM_36921"      
## [796] "PRAM_36921.1"     "PRAM_36921.1"     "PRAM_36921.1.p1" 
## [799] "PRAM_36926"       "PRAM_36926"       "PRAM_36926"      
## [802] "PRAM_36926"       "PRAM_36926"       "PRAM_36926.1"    
## [805] "PRAM_36926.1"     "PRAM_36926.1"     "PRAM_36926.1"    
## [808] "PRAM_36938"       "PRAM_36938.1"     "PRAM_36938.1"    
## [811] "PRAM_36938.1"     "PRAM_36938.1.p1"  "PRAM_36938.1.p1" 
## [814] "PRAM_36938.1.p1"  "PRAM_36967"       "PRAM_36967"      
## [817] "PRAM_36967"       "PRAM_36967"       "PRAM_36967.1"    
## [820] "PRAM_36967.1"     "PRAM_36967.1"     "PRAM_36967.1"    
## [823] "PRAM_36967.1.p2"  "PRAM_36976"       "PRAM_36976"      
## [826] "PRAM_36976"       "PRAM_36976.1"     "PRAM_36976.1"    
## [829] "PRAM_36976.1"     "PRAM_36976.1"     "PRAM_36976.1"    
## [832] "PRAM_36976.1"     "PRAM_36976.1.p1"  "PRAM_36976.1.p1" 
## [835] "PRAM_36976.1.p1"  "PRAM_36977.1"     "PRAM_36977.1"    
## [838] "PRAM_36977.1"     "PRAM_36977.1"     "PRAM_36977.1"    
## [841] "PRAM_36997"       "PRAM_36997"       "PRAM_36997"      
## [844] "PRAM_36997.1"     "PRAM_36997.1"     "PRAM_36997.1"    
## [847] "PRAM_36997.1.p1"  "PRAM_36997.1.p1"  "PRAM_36997.1.p1" 
## [850] "PRAM_37001"       "PRAM_37001"       "PRAM_37001.1"    
## [853] "PRAM_37001.1"     "PRAM_37001.1.p1"  "PRAM_37001.1.p1" 
## [856] "PRAM_37004"       "PRAM_37004.1"     "PRAM_37004.1.p1" 
## [859] "PRAM_37006"       "PRAM_37006.1"     "PRAM_37006.1.p1" 
## [862] "PRAM_37008"       "PRAM_37008"       "PRAM_37008"      
## [865] "PRAM_37008.1"     "PRAM_37008.1"     "PRAM_37008.1"    
## [868] "PRAM_37008.1.p1"  "PRAM_37011"       "PRAM_37011"      
## [871] "PRAM_37011.1"     "PRAM_37011.1"     "PRAM_37011.1.p1" 
## [874] "PRAM_37011.1.p1"  "PRAM_43769.1"     "PRAM_43769.1"    
## [877] "PRAM_43769.1.p1"  "PRAM_448.1"       "PRAM_448.1"      
## [880] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [883] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [886] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [889] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [892] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [895] "PRAM_448.1"       "PRAM_448.1"       "PRAM_448.1"      
## [898] "PRAM_448.1"       "PRAM_46974"       "PRAM_46974"      
## [901] "PRAM_46974"       "PRAM_46974"       "PRAM_46974"      
## [904] "PRAM_46974"       "PRAM_46974.1"     "PRAM_46974.1"    
## [907] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_46974.1"    
## [910] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_46974.1"    
## [913] "PRAM_46974.1"     "PRAM_46974.1"     "PRAM_52553.1"    
## [916] "PRAM_52553.1"     "PRAM_52553.1.p1"  "PRAM_52553.1.p1" 
## [919] "PRAM_66596.1.p1"  "PRAM_66596.1.p1"  "PRAM_66596.1.p1" 
## [922] "PRAM_67540.1"     "PRAM_67549"       "PRAM_67549.1"    
## [925] "PRAM_67549.1"     "PRAM_67549.1.p1"  "PRAM_67549.1.p1" 
## [928] "PRAM_73094"       "PRAM_73094"       "PRAM_73094"      
## [931] "PRAM_73094"       "PRAM_73094"       "PRAM_73094"      
## [934] "PRAM_73094"       "PRAM_758"         "PRAM_758.1"      
## [937] "PRAM_758.1.p1"    "PRAM_80941"       "PRAM_80941"      
## [940] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [943] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [946] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [949] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [952] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [955] "PRAM_80941"       "PRAM_80941"       "PRAM_80941"      
## [958] "PRAM_80941.1"     "PRAM_80941.1"     "PRAM_80941.1.p1" 
## [961] "PRAM_80945"       "PRAM_80945.1"     "PRAM_80945.1"    
## [964] "PRAM_80945.1.p1"  "PRAM_8262"        "PRAM_8262"       
## [967] "PRAM_8262"        "PRAM_85890"       "PRAM_85890"      
## [970] "PRAM_85890"       "PRAM_85890"       "PRAM_888"        
## [973] "PRAM_888"         "PRAM_888"         "PRAM_888"        
## [976] "PRAM_888"         "PRAM_888"         "PRAM_888.1"      
## [979] "PRAM_891.1"       "PRAM_95186.1.p1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6802958 363.4   11303600  603.7  14129500  754.6
## Vcells 101713704 776.1  189394518 1445.0 189394518 1445.0

4.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

4.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 22707
length(unique(df$to_geneID))
## [1] 23599
range(df$from_count)
## [1]  1 55
range(df$to_count)
## [1]  1 58
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 204
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 197
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          64476          13068           6779
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       30812    11042           5277
##   2       10000     1238            874
##   3        7084      437            281
##   4        6757      209            159
##   5        6640      108            132
##   6        3183       34             56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            4773            1037            2709
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                6150                2709                1553                 452 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               17647                 722                4773                 682 
##               PLAZA         RBH_MapMan4              reject 
##                5638                1037               42960
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

4.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          36016           3531           1816
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       10339     1978            856
##   2        5241      888            468
##   3        4936      342            179
##   4        5924      187            131
##   5        6393      102            126
##   6        3183       34             56
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20066
length(unique(kept$to_geneID))
## [1] 22745
range(kept$from_count)
## [1]  1 34
range(kept$to_count)
## [1]  1 43
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 3
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 13
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

4.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 184                 100                  68                  15 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 789                  45                  78                  24 
##               PLAZA         RBH_MapMan4              reject 
##                 206                  22                1821
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'vvi'
  , plantNameOut = "grapevine"
  , plantDirOut = file.path('..', 'reports', group, "grapevine")
  , flag = 1
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000001dac2f60548>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%

cat(child_content)

5 Subsection: vvi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
table(df$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   13825   61099   18589   49707   18488   26763
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID             to_plant source 
##    <chr>       <chr>      <chr>                 <chr>    <chr>  
##  1 ATCG00380   ath        Vitvi05_01chr00g00010 vvi      FastOMA
##  2 ATCG00780   ath        Vitvi05_01chr00g00040 vvi      FastOMA
##  3 AT4G20320   ath        Vitvi05_01chr19g23100 vvi      FastOMA
##  4 AT1G79580   ath        Vitvi05_01chr19g23110 vvi      FastOMA
##  5 AT1G10350   ath        Vitvi05_01chr01g13730 vvi      MCScanX
##  6 AT1G10370   ath        Vitvi05_01chr01g13610 vvi      MCScanX
##  7 ATMG01190   ath        Vitvi05_01chr10g19400 vvi      MCScanX
##  8 ATMG01280   ath        Vitvi05_01chr10g19500 vvi      MCScanX
##  9 AT1G44800   ath        Vitvi05_01chr18g04300 vvi      OrthoDB
## 10 AT1G21890   ath        Vitvi05_01chr18g04300 vvi      OrthoDB
## 11 AT2G07695   ath        Vitvi05_01chr00g07410 vvi      OrthoDB
## 12 AT2G07695   ath        Vitvi05_01chr10g19710 vvi      OrthoDB
## 13 AT3G20250   ath        Vitvi05_01chr09g20830 vvi      PLAZA  
## 14 AT4G25880   ath        Vitvi05_01chr09g20830 vvi      PLAZA  
## 15 AT3G27027   ath        Vitvi05_01chr17g01390 vvi      PLAZA  
## 16 AT3G01940   ath        Vitvi05_01chr17g01390 vvi      PLAZA  
## 17 AT1G01020   ath        Vitvi05_01chr10g07040 vvi      RBH    
## 18 AT1G01030   ath        Vitvi05_01chr02g03430 vvi      RBH    
## 19 ATMG01410   ath        Vitvi05_01chr00g07980 vvi      RBH    
## 20 ATMG01410   ath        Vitvi05_01chr10g19280 vvi      RBH    
## 21 AT2G07785   ath        Vitvi05_01chr10g19420 vvi      compara
## 22 AT2G07734   ath        Vitvi05_01chr00g07530 vvi      compara
## 23 AT5G47380   ath        Vitvi05_01chr19g16420 vvi      compara
## 24 AT1G79390   ath        Vitvi05_01chr19g22760 vvi      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "gn", "sn", "pss_long", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))




gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   5253367 280.6   11303600  603.7  14129500  754.6
## Vcells 102582193 782.7  189394518 1445.0 189394518 1445.0

5.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23140
length(unique(dt$to_geneID))
## [1] 24769
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   13825   61099   18589   49707   18488   26763
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)

df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) # 

df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

5.6 fruitTrees plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "athName"         "athSynonims"    
## [17] "all_pathways"    "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))
## 
## FALSE 
## 96387
dt[is.na(dt$fruitTrees_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df", 
                          "ath.gmm", "gn", "sn", "pss_long",  
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut", 
                          "pattern_in", 
                          "pattern_out", 
                          "mercator", 
                          "mercatorPatternIn1", 
                          "mercatorPatternOut1", 
                          "mercatorPatternIn2", 
                          "mercatorPatternOut2",
                          "flag")))


gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   7174428 383.2   11262900  601.6  14129500  754.6
## Vcells 109255355 833.6  190913032 1456.6 190913032 1456.6

5.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # split string by | then by ; and trim tokens,
  # then truncate each token to first three dot-separated levels
  split_tokens = function(code) {
    if(is.na(code) || code == "") return(character(0))
    parts = stringr::str_split(code, "\\|", simplify = TRUE)
    tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
    tokens = unique(stringr::str_trim(tokens))
    
    # For each token, extract first 3 dot levels
    trunc3levels = function(token) {
      levels = unlist(stringr::str_split(token, "\\."))
      if(length(levels) > 3) {
        levels = levels[1:3]
      }
      paste(levels, collapse = ".")
    }
    
    truncated_tokens = sapply(tokens, trunc3levels)
    unique(truncated_tokens)
  }
  
  bin_set = split_tokens(athMercator)
  v4_set = split_tokens(plantXMercator)
  
  # Tokens that are common between sets truncated to 3 levels
  common_tokens = intersect(bin_set, v4_set)
  
  # Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
  v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
  if(length(bin_set) == 1 &&
     length(v4_parts) > 1 &&
     all(split_tokens(plantXMercator) == bin_set)) {
    return(paste0("100% match based on ", bin_set))
  }
  
  # Check if sets are identical
  if(setequal(bin_set, v4_set)) {
    return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
  }
  
  # Partial match if any tokens overlap, mention those tokens
  if(length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name 
  dplyr::ungroup()

5.8 Filter

# now

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
length(unique(df$from_geneID))
## [1] 23140
length(unique(df$to_geneID))
## [1] 24769
range(df$from_count)
## [1]   1 162
range(df$to_count)
## [1]   1 115
length(unique(df$from_geneID[df$from_count > 30]))
## [1] 372
length(unique(df$to_geneID[df$to_count > 30]))
## [1] 239
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  methods = c("MCScanX", 'RBH', "FastOMA")
}


match_categories = c("no match", "100% match based", "partial match")

long_dt = data.table::rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          75529          13305           7553
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       41087    11590           6714
##   2       11611     1082            552
##   3        6478      428            127
##   4        5958      109             83
##   5        6401       65             51
##   6        3994       31             26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
  special_methods = c("RBH", "FastOMA")
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          # reconsider
          # (grepl("match based on", mapman_val, ignore.case = TRUE) &&
          #   !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")
## [1] "MapMan4 assignment counts per method:"
print(mapman4_counts)
## OrthoDB_MapMan4     RBH_MapMan4 FastOMA_MapMan4 
##            9694            1089            4300
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
table(dt$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                4089                4300                3100                 532 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##               18589                1115                9694                1218 
##               PLAZA         RBH_MapMan4              reject 
##                4650                1089               48011
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

5.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]


# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]





par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_kept = data.table::rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = c("no match", "100% match based", "partial match"),
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
    )
  )]
}), use.names = TRUE)

long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]

ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (after filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)
## 
##    100% match        no match partial match  
##          43587           2504           2285
table(keptsub$count_evidence, keptsub$MapMan4_Match)
##    
##     100% match  no match partial match 
##   1       16019     1060           1693
##   2        7311      868            349
##   3        4861      385             91
##   4        5276       98             77
##   5        6126       62             49
##   6        3994       31             26
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))

tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))

ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]


ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Frequency by MapMan4_Match (after filter)",
    x = "KG Criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]


openxlsx::write.xlsx(kept, 
                     paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
  source_cols = c("MCScanX", 'RBH', "FastOMA")
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
length(unique(kept$from_geneID))
## [1] 20775
length(unique(kept$to_geneID))
## [1] 23682
range(kept$from_count)
## [1]   1 156
range(kept$to_count)
## [1]   1 102
length(unique(kept$from_geneID[kept$from_count > 30]))
## [1] 77
length(unique(kept$to_geneID[kept$to_count > 30]))
## [1] 26
cat('####  ####  ####  ####  \n')
## ####  ####  ####  ####

5.10 PSS kept/rejected

pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 125                  82                 130                  39 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 754                  68                 212                  36 
##               PLAZA         RBH_MapMan4              reject 
##                 238                  25                1234
openxlsx::write.xlsx(pss_long, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
# Step 1: params_list
# params_list <- list(
# ...
# )
# 
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
#   plantName1: NULL
#   plantName2: NULL
#   plantName3: NULL
#   plantName4: NULL
#   plantDirIn: NULL
#   plantNameOut: NULL
#   plantDirOut: NULL
#   pattern_in: NULL
#   pattern_out: NULL
#   compara_pattern_in1: NULL
#   compara_pattern_in2: NULL
#   plaza_pattern_in1: NULL
#   plaza_pattern_in2: NULL
#   ref_genome: NULL
#   mercator: NULL
#   mercatorPatternIn1: NULL
#   mercatorPatternOut1: NULL
#   mercatorPatternIn2: NULL
#   mercatorPatternOut2: NULL
# ---
# 
# 
# access params in the script like:
# params$plantName1
# params$plantDirOut
# 
# Step 3: Call render() like
# rmarkdown::render(
#   input = "09_fruitTrees-child.Rmd",
#   params = params_list,
#   envir = new.env(parent = globalenv()),  # optional: isolate execution
#   output_format = "html_document",
#   quiet = FALSE
# )
# 
# 
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory

6 SessionInfo

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8  LC_CTYPE=Slovenian_Slovenia.utf8   
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=Slovenian_Slovenia.utf8    
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0      knitr_1.50         data.table_1.17.8 
## [5] magrittr_2.0.4    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.5.1     zip_2.3.3          Rcpp_1.1.0         tidyselect_1.2.1  
##  [9] stringr_1.5.2      jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10       
## [13] fastmap_1.2.0      R6_2.6.1           labeling_0.4.3     generics_0.1.4    
## [17] patchwork_1.3.2    igraph_2.1.4       openxlsx_4.2.8     tibble_3.3.0      
## [21] bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3 rlang_1.1.6       
## [25] utf8_1.2.6         cachem_1.1.0       stringi_1.8.7      xfun_0.53         
## [29] sass_0.4.10        S7_0.2.0           cli_3.6.5          withr_3.0.2       
## [33] digest_0.6.37      grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4   
## [37] vctrs_0.6.5        evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [41] colorspace_2.1-1   rmarkdown_2.29     tools_4.5.1        pkgconfig_2.0.3   
## [45] htmltools_0.5.8.1